set more off
pause on
clear
cap clear matrix

***************************************************************************************************************
*   This program is to calculate the moments for estimation.
*Four sets of moment conditions at each age from 18(22) to 65 are chosen to represent the life-cycle profiles.
*1. The labor force participation rate; 
*2. The employment transition rates: from working to working and from non-working to non-working;
*3. The first and second moments of the logarithm of wages;
*4. The correlation of the logarithm of wages between the current age and the next age.
* we use the lnw excluding year fixed effects for both lnw and lnw_fd
* we use lnw_fd instead of lnw_fe
****************************************************************************************************************
cap program drop p_moments
program define p_moments
   * lfpr_mean, lnw_mean, lnw_sd
   preserve
      sort col age
      collapse (mean) work2 work_pt lfpr_mean=work lnw_mean=lnwage lnw_exclyr_mean=lnw_exclyr (sd) lnw_sd=lnwage lnw_sd_exclyr=lnw_exclyr, by(col age)
      sort col age
      save dta_sipp_moments.dta, replace

      sort col age
      by col: gen lfpr_exit = lfpr_mean[_n-1] - lfpr_mean
      drop if missing(lfpr_exit)
      sort col age
      outsheet col age lfpr_exit using raws/raw_sipp_moments_lfpr_exit.raw, nonames nolabel replace
      if (${gvcopy2fdir}==1) {
          cp raws/raw_sipp_moments_lfpr_exit.raw ${gvfdir}, replace
      }

      merge 1:1 col age using raws/dta_sipp_moments_ss.dta, keepusing(rSS frSS) nogenerate
      sort col age
      foreach ivcol in 0 1 {
         scatter lfpr_exit frSS age if col==`ivcol' & age>=50 & age<=80, c(l l) lpattern("l" "_") msymbol(O X D T) ///
              xline(65, lcolor(gs15) lwidth(medthick)) ///
              title("SS receipient") graphregion(color(white))  ///
              legend(row(1) order(1 "lfpr_exit" 2 "frSS"))
         graph export pics/col`ivcol'_lfpr_exit.eps, replace 
      }
   restore

   foreach ivcol in 0 1 {
      preserve
         keep if col==`ivcol' & age>=(22+`ivcol'*4) & age<=65
         sort id age

         xi: xtreg lnwage i.age, fe i(id)
         predict lnw_fe_hat if lnwage<.   // we are NOT using this

         xi: xtreg lnw_exclyr i.age, fe i(id)
         predict lnw_exclyr_fe_hat if lnw_exclyr<.   // we do not use this either

         // this is to generate lnw_FD from lnw_exclyr
         xtset id age
         sort id age
         by id: gen dlnw_exclyr = lnw_exclyr - lnw_exclyr[_n-1] if age==age[_n-1]+1
         by id: gen dlnw_exclyr_base = lnw_exclyr if age==age[_n+1]-1 & !missing(dlnw_exclyr[_n+1])
         gen lnw_exclyr_ft = lnw_exclyr if work==1
         by id: gen dlnw_exclyr_ft = lnw_exclyr_ft - lnw_exclyr_ft[_n-1] if age==age[_n-1]+1
         by id: gen dlnw_exclyr_ft_base = lnw_exclyr_ft if age==age[_n+1]-1 & !missing(dlnw_exclyr_ft[_n+1])

         collapse (mean) lnw_fe_mean=lnw_fe_hat lnw_exclyr_fe_mean=lnw_exclyr_fe_hat  ///
                         dlnw_exclyr dlnw_exclyr_base lnw_exclyr_ft dlnw_exclyr_ft dlnw_exclyr_ft_base, by(col age)
         sort col age

         replace dlnw_exclyr = dlnw_exclyr_base if age==(22+`ivcol'*4) 
         gen lnw_exclyr_fd = sum(dlnw_exclyr) if age>=(22+`ivcol'*4)
         replace dlnw_exclyr_ft = dlnw_exclyr_ft_base if age==(22+`ivcol'*4) 
         gen lnw_exclyr_ft_fd = sum(dlnw_exclyr_ft) if age>=(22+`ivcol'*4)
         
         if (`ivcol'==1) {
            append using dta_sipp_moments_lnwfe.dta
         }
         sort col age
         save dta_sipp_moments_lnwfe.dta, replace
      restore
   }
   preserve
      use dta_sipp_moments.dta, clear
      sort col age
      merge 1:1 col age using dta_sipp_moments_lnwfe.dta, nogenerate
      sort col age 
      save dta_sipp_moments.dta, replace
   restore

   * get other moments
   keep id col age work lnw_exclyr

   gen work_next = .
   gen lnw_exclyr_next = .
   sort id age
   by id: replace work_next = work[_n+1] if age[_n+1] == age+1
   by id: replace lnw_exclyr_next = lnw_exclyr[_n+1] if age[_n+1] == age+1


   * lfpr1to0, lfpr0to1, at each age 18-70
   * base: skip pt; variable: skip pt as well;
   preserve
      keep if age>=18 & age<=70
      drop if work >= .
      collapse (mean) lfpr_trans = work_next, by(col age work)
      reshape wide lfpr_trans, i(col age) j(work)
      ren lfpr_trans1 lfpr1to0
      replace lfpr1to0 = 1 - lfpr1to0
      ren lfpr_trans0 lfpr0to1
      sort col age
      merge 1:1 col age using dta_sipp_moments.dta, nogenerate
      sort col age 
      save dta_sipp_moments.dta, replace
   restore
   * lfpr1to0, lfpr0to1, aggregate 35-50, stored at age=1
   preserve
      keep if age>=35 & age<=50
      drop if work >= .
      collapse (mean) lfpr_trans = work_next, by(col work)
      gen age = 1
      reshape wide lfpr_trans, i(col age) j(work)
      ren lfpr_trans1 lfpr1to0
      replace lfpr1to0 = 1 - lfpr1to0
      ren lfpr_trans0 lfpr0to1
      sort col age
      merge 1:1 col age using dta_sipp_moments.dta, nogenerate
      sort col age 
      save dta_sipp_moments.dta, replace
   restore

   *lnw_correlation. in data, age: 18-65 for col=0 or 22-65 for col=1
   local lvcount = 0
   foreach ivcol in 0 1 {
      sum age if col==`ivcol'
      local ragemin = max(18+`ivcol'*4, r(min))
      local ragemax = min(r(max)-1, 70)
      forvalues iage = `ragemin'/`ragemax' {
         corr lnw_exclyr lnw_exclyr_next if col==`ivcol' & age==`iage'
         preserve
            keep age
            keep if [_n] == 1
            gen lnw_corr = r(rho)
            replace age = `iage'
            gen col = `ivcol'
            if (`lvcount'==0) {
               local lvcount = 1
            }
            else {
               append using dta_sipp_moments_lnwcorr.dta
            }
            sort col age
            save dta_sipp_moments_lnwcorr.dta, replace
         restore
      }
      *lnw_correlation. in data, aggregate over 35-50, stored at age=2
      corr lnw_exclyr lnw_exclyr_next if col==`ivcol' & age>=35 & age<=50
      preserve
         keep age
         keep if [_n] == 1
         replace age = 2
         gen col = `ivcol'
         gen lnw_corr = r(rho)
         append using dta_sipp_moments_lnwcorr.dta
         sort col age 
         save dta_sipp_moments_lnwcorr.dta, replace
      restore
   }
   preserve
      use dta_sipp_moments.dta, clear
      sort col age
      merge 1:1 col age using dta_sipp_moments_lnwcorr.dta, nogenerate
      sort col age
      save dta_sipp_moments.dta, replace
   restore
end


***************************************************************************************************************
*   This program is to calculate the additional moments for estimation.
* the SS (social security) claiming age, PDF and CDF, between age 62 and 70.
*****************************************************************************************************************
cap program drop p_moments_ss
program define p_moments_ss
    preserve
        drop if missing(SS) | SS<0
        *gen rSS = (SS>0 & age>=62) if !missing(SS) & !missing(age)
        gen rSS = (SS>0) if !missing(SS) & !missing(age)
        sort id wave age
        
        by id: gen SS_entry = (rSS==1) if rSS[_n-1]==0
        by id: gen SS_exit = (rSS==0) if rSS[_n-1]==1 
        by id: gen SS_stay = (rSS==1) if rSS[_n-1]==1

        sort col age
        collapse (mean) rSS SS_entry SS_exit SS_stay, by(col age)
        sort col age
        gen frSS = .
        by col: replace frSS = rSS - rSS[_n-1]

        label var rSS "SS recipient"
        label var frSS "First time SS recipient"
        order col age rSS frSS
        
        save raws/dta_sipp_moments_ss.dta, replace

        gen frSSp = frSS
        replace frSSp = 0 if age<62 | age>70
        by col: egen xx = total(frSSp)
        replace frSSp = frSSp / xx
        drop xx

        outsheet col age rSS frSSp using raws/raw_sipp_moments_ss.raw if age>=60, nonames nolabel replace
        if (${gvcopy2fdir}==1) {
            cp raws/raw_sipp_moments_ss.raw ${gvfdir}, replace
        }

        *save dta_sipp_moments_ss.dta, replace
        foreach ivcol in 0 1 {
           scatter rSS age if col==`ivcol' & age>=60 & age<=80, c(l) lpattern("l") msymbol(O X D T) ///
              || scatter frSS age if col==`ivcol' & age>=60 & age<=80, yaxis(2) c(l) lpattern("_") msymbol(S) ///
                xline(65, lcolor(gs15) lwidth(medthick)) ///
                title("SS receipient") graphregion(color(white))  ///
                legend(row(1) order(1 "CDF" 2 "PDF"))
           graph export pics/col`ivcol'_SS_CDF.eps, replace 

           scatter SS_entry SS_exit age if col==`ivcol' & age>=60 & age<=80, c(l) lpattern("l" "_") msymbol(O X D T) || ///
              scatter SS_stay age if col==`ivcol' & age>=60 & age<=80, yaxis(2) connect(l) lpattern("_.") msymbol(S) ///
                xline(65, lcolor(gs15) lwidth(medthick)) ///
                title("SS entry and exit") graphregion(color(white))  ///
                legend(row(1) order(1 "SS entry" 2 "SS exit" 3 "SS stay"))
           graph export pics/col`ivcol'_SS_entry.eps, replace
        }
    restore
end

***************************************************************************************************************
*   This program is to calculate the additional moments for estimation.
* the SSI claiming age, PDF and CDF, between age 62 and 70.
*****************************************************************************************************************
cap program drop p_moments_ssi
program define p_moments_ssi
    preserve
        drop if missing(SSI) | SSI<0
        gen rSSI = (SSI>0) if !missing(SSI) & !missing(age)
        sort id wave age
        
        by id: gen SSI_entry = (rSSI==1) if rSSI[_n-1]==0
        by id: gen SSI_exit = (rSSI==0) if rSSI[_n-1]==1 
        by id: gen SSI_stay = (rSSI==1) if rSSI[_n-1]==1

        sort col age
        collapse (mean) rSSI SSI_entry SSI_exit SSI_stay, by(col age)
        sort col age
        gen frSSI = .
        by col: replace frSSI = rSSI - rSSI[_n-1]

        label var rSSI "SSI recipient"
        label var frSSI "First time SSI recipient"
        order col age rSSI frSSI
        outsheet col age rSSI frSSI using raws/raw_sipp_SSI.raw if age>=40, nonames nolabel replace
        *if (${gvcopy2fdir}==1) {
        *    cp raws/raw_sipp_SSI.raw ${gvfdir}, replace
        *} 
        foreach ivcol in 0 1 {
           scatter rSSI age if col==`ivcol', c(l) lpattern("l") msymbol(O X D T) ||   ///
              scatter frSSI age if col==`ivcol', yaxis(2) c(l) lpattern("_") msymbol(S) ///
                xline(65, lcolor(gs15) lwidth(medthick)) ///
                title("SS receipient") graphregion(color(white))  ///
                legend(row(1) order(1 "CDF" 2 "PDF"))
           graph export pics/col`ivcol'_SSI_CDF.eps, replace 

           scatter SSI_entry SSI_exit age if col==`ivcol', c(l) lpattern("l" "_") msymbol(O X D T) || ///
              scatter SSI_stay age if col==`ivcol', yaxis(2) connect(l) lpattern("_.") msymbol(S) ///
                xline(65, lcolor(gs15) lwidth(medthick)) ///
                title("SSI entry and exit") graphregion(color(white))  ///
                legend(row(1) order(1 "SSI entry" 2 "SSI exit" 3 "SSI stay"))
           graph export pics/col`ivcol'_SSI_entry.eps, replace
        }
    restore
end
*
***************************************************************************************************************
*   This program is to calculate the additional moments for estimation.
* the wage depreciation after unemployment spell
* this is using the orignal data, not collapsed at the year level
*****************************************************************************************************************
cap program drop p_moments_unemp
program define p_moments_unemp
    preserve
        sort col calyr
        merge m:1 col calyr using dta_sipp_lnw_p.dta, nogenerate
        gen lnw_exclyr = lnwage - p

        sort id wave
        drop if emnths==0
        gen dwage1=lnwage-lnwage[_n-1] if id==id[_n-1]
        gen dwage_exclyr1=lnw_exclyr-lnw_exclyr[_n-1] if id==id[_n-1]

        gen dun1=4*(wave-wave[_n-1]-1)+4-emnth if dwage1~=.
        
        gen dwage2=lnwage-lnwage[_n-2] if (id==id[_n-2] & emnth[_n-1]<4) & (wave[_n-1]==wave[_n-2]+1)
        gen dwage_exclyr2=lnw_exclyr-lnw_exclyr[_n-2] if (id==id[_n-2] & emnth[_n-1]<4) & (wave[_n-1]==wave[_n-2]+1)

        gen dun2=4*(wave-wave[_n-1]-1)+(4-emnth)+(4-emnth[_n-1])

        gen dwage=dwage2
        gen dwage_exclyr=dwage_exclyr2

        gen dun=dun2-12
        replace dwage=dwage1 if dwage1~=.
        replace dwage_exclyr=dwage_exclyr1 if dwage1~=.

        replace dun=dun1-12 if dwage1~=.
        
        save dta_temp_unemp.dta, replace

        *reg dwage dun if dun>-7 & dun<7 & col==0
        *reg dwage dun if dun>-7 & dun<7 & age>40 & col==0 
        gen plnw18 = 0
        gen plnw41 = 0
        gen plnw18_exclyr = 0
        gen plnw41_exclyr = 0
        foreach ivcol in 0 1 {
            reg dwage dun if dun>-7 & dun<7 & col==`ivcol' & age>=18+`ivcol'*4 & age<=65
            replace plnw18 = _b[_cons] if col==`ivcol'
            reg dwage dun if dun>-7 & dun<7 & col==`ivcol' & age>40 & age<=65
            replace plnw41 = _b[_cons] if col==`ivcol'
            reg dwage_exclyr dun if dun>-7 & dun<7 & col==`ivcol' & age>=18+`ivcol'*4 & age<=65
            replace plnw18_exclyr = _b[_cons] if col==`ivcol'
            reg dwage_exclyr dun if dun>-7 & dun<7 & col==`ivcol' & age>40 & age<=65
            replace plnw41_exclyr = _b[_cons] if col==`ivcol'
        }
        keep col plnw18* plnw41*
        bys col: keep if [_n] == 1
        sort col
        outsheet col plnw41 using raws/raw_sipp_moments_unemp.raw, nonames nolabel replace  // we use the changes in lnw
        outsheet col plnw41 plnw41_exclyr using raws/raw_sipp_moments_unemp_compare.raw, nonames nolabel replace
        if (${gvcopy2fdir}==1) {
            cp raws/raw_sipp_moments_unemp.raw ${gvfdir}, replace
        }
    restore
end


*****************************************************************
*  ENDof This program is to calculate the moments for estimation.
*****************************************************************

***************************************************************************************************************
*   This program is to calculate the variance-covariance of the sample data
*   moments: 1 lfpr_mean, 2 lnw_mean, 3 lnw_sd, 4 lnw_fe_mean, 5 diff_lfpr
****************************************************************************************************************
cap program drop p_varcov
program define p_varcov
   local lvcol = `1'
   local lvcolinitage = 22+`lvcol'*4
   preserve
      keep if col==`lvcol' & age>=`lvcolinitage' & age<=65
      sort calyr

      * lnw_exclyr means filtering out the year fixed effects
      //xi: xtreg lnw_exclyr i.age, fe i(id)
      //predict lnw_exclyr_fe_hat if lnw_exclyr<.

      ren work lfpr
      ren lnw_exclyr lnw
      //ren lnw_exclyr_fe_hat lnw_fe

      * lnwsd = (lnw - lnw_mean)^2, first we need to get the lnw_mean
      sort col age
      merge m:1 col age using dta_sipp_moments.dta, keepusing(lnw_exclyr_mean lnw_exclyr_fd)
      tab age if _merge == 2
      drop if _merge == 2
      drop _merge
      gen lnwsd = (lnw - lnw_exclyr_mean) ^ 2 if lnw < .

      ren lnw_exclyr_fd lnw_fd

      * we use unconditional moments
      foreach iv in lnw lnwsd lnw_fd {
         replace `iv' = 0 if lfpr == 0
      }

      * In Fortran, stack lfpr_mean from 22 to 65 first, then lnw_mean from 22-65 second, and so on so forth
      * 1 lfpr_mean, 2 lnw_mean, 3 lnw_sd, 4 lnw_fd
      local lvmoms lfpr lnw lnwsd lnw_fd
      keep id age `lvmoms'
      order id age `lvmoms'
      sort id age
      reshape wide `lvmoms', i(id) j(age)

      local lvvarmom ""
      local lvcn = 0
      *pause
      foreach iv of local lvmoms {
         forvalues jv = `lvcolinitage'/65 {
            local lvcnx = `lvcn' + 1
            local lvcn = `lvcnx'
            ren `iv'`jv' mom`lvcn' 
            local lvvarmomx `lvvarmom' mom`lvcn'
            local lvvarmom `lvvarmomx'
         }
      }
      *pause
      order id `lvvarmom'
      *pause 
      global gvvarmom `lvvarmom'
      save dta_var_temp.dta, replace

      * covariance
      pwcov `lvvarmom', save
      *save indicates that four new variables should be created:  pw_t, pw_tk, pw_cov and pw_N. 
      *The first two variables give the row and column indices of the covariance, provided in the
      *  third variable, while pw_N provides the number of observations used to compute that covariance. 
      *These variables are added to the current data set, and must not already exist.
      keep if pw_t < .
      keep pw_t pw_tk pw_cov pw_N

      order pw_t pw_tk pw_cov pw_N
      sort pw_t pw_tk

      save dta_varcov_col`lvcol'.dta, replace

      * variance
      use dta_var_temp.dta, clear
      collapse (sd) ${gvvarmom}
      gen byte id = 1
      reshape long mom, i(id) j(pw_t)
      gen pw_tk = pw_t
      replace mom = mom ^ 2
      ren mom pw_cov
      drop id
      append using dta_varcov_col`lvcol'.dta
      gen col = `lvcol'
      sort col pw_t pw_tk
      save dta_varcov_col`lvcol'.dta, replace
      rm dta_var_temp.dta
   restore
end

********************************************************************
*  ENDof This program is to calculate the var-cov of the samle data.
********************************************************************


*************************
*   the main program
**************************

/****/
cap log close
log using log_usedat.log, replace

local lvrawdir "raws"

global gvfdir "data4fortran/"

global gvcopy2fdir = 1 
* 0 not copy, 1 copy

*******************This is to stop the following code being running **********************************
use hclsdat.dta, clear
*drop if sex==2
keep if ed==12 | ed==16
gen col = (ed==16)

* keep hs (12) or col (16)

********************************
*in the hclsdat.dta, 
*age
*64	36,118	1.01	99.25
*65	19,406	0.55	99.80
*66	6,226	0.17	99.97
*67	1,066	0.03	100.00
********************************
*drop if age<18
*drop if age>65

* even thought our moments start from 22, but our model starts from 18, 
* so we still want to see some comparison between 18 and 22

*keep if age>=18 & age<=70 & calyr<.
keep if age>=18 & calyr<.
drop if age<22 & col==1

sum
tab hrs
*gen work=hrs>0
*replace work=emp_b if hrs==.

************* Clean the data ****************************
* make sure it is the same person, remove duplication
replace id=id*1000+sipp
sort id

by id: gen NID = [_N]
tab NID
drop NID

** keep obs with age difference 1-5 years *
by id: egen agemin = min(age)
by id: egen agemax = max(age)
by id: gen agediff = agemax - agemin
tab agediff
*pause
keep if agediff >= 0 & agediff <= 5
drop agediff agemin agemax

** drop obs with 1+ col difference *
by id: egen colmin = min(col)
by id: egen colmax = max(col)
drop if colmax != colmin
drop colmax colmin
* 229 obs deleted, 943,324 remaining

** max wave per person is 12 before sipp==108 and 15 for sipp==108
*http://www.census.gov/programs-surveys/sipp/tech-documentation/complete-technical-documentation.html
*http://www.nber.org/data/survey-of-income-and-program-participation-sipp-data.html
*1984 9
*1985 8
*1986 7
*1987 7
*1988 6
*1989 3
*1990 8
*1991 8
*1992 9
*1993 9
*1996 12
*2001 9
*2004 12
*2008 15
*

drop if sipp==92 & calyr==95
by id: gen NID = [_N]
tab NID
drop if NID>9 & sipp==84
drop if NID>8 & sipp==85
drop if NID>7 & sipp==86
drop if NID>7 & sipp==87
drop if NID>6 & sipp==88
drop if NID>3 & sipp==89
drop if NID>8 & sipp==90
drop if NID>8 & sipp==91
tab NID if sipp==92
*pause
drop if NID>9 & sipp==92
drop if NID>9 & sipp==93
drop if NID>12 & sipp==96
drop if NID>9 & sipp==101
drop if NID>12 & sipp==104
drop if NID>15 & sipp==108

*pause
drop NID

** keep obs with only 1, 2 or 3 waves per age *
sort id age
by id age: gen NID = [_N]
tab NID
drop if NID > 3
*pause
drop NID

save dta_sipp_4ss_unemp.dta, replace
* in this file, the labor force status is original, not collapsed.

**********************************************************************
*    Construct the labor force participation rates

*Yeah thought its a little complicated
*-) In years in which we sample all three periods if you are working
*   at least two out of three you are working, otherwise you aren't
*-) In years in which we sample only one period, we use that
*-) The complication is years in which we only sample you twice.  
*   If you are working one and not the other, we could just flip a coin

* and marital status, spouse income
*************************************************************************
replace lnwage = . if work==0
replace hrs = . if work==0

ren SSI ssi
replace ssi = . if ssi<0

sort id age calyr  // it is important to sort by calyr as well, as (first) in the next line
collapse (mean) col work hrs lnwage spspres spswk spsinc ssi (first) sipp calyr ed, by(id age)
tab col
assert (col==0 | col==1)

* translate weekly inc into hourly inc
replace spsinc = spsinc / 40

assert !missing(work)

gen work_orig = work
gen spspres_orig = spspres
gen spswk_orig = spswk
* full time, part time, or none
gen byte work_pt = (work>0 & work<=0.55)
label var work_pt "Working Part Time"

* full time or none only
gen byte work2 = (work>0)
replace work = 0 if work < 0.5
replace work = 1 if work > 0.5 & !missing(work) 

set seed 543645
* randomly assign if work==0.5 for transition
*replace work = rbinomial(1,0.5) if work==0.5
* ignore if work==0.5 for transition
replace work = . if work==0.5

replace hrs = . if work == 0
replace lnwage = . if work == 0

*replace spspres = 0 if spspres < 0.5
*replace spspres = 1 if spspres > 0.5 & !missing(spspres)
*replace spspres = rbinomial(1,0.5) if spspres==0.5
replace spspres = 1 if spspres > 0 & !missing(spspres)

*replace spswk = 0 if spswk < 0.5
*replace spswk = 1 if spswk > 0.5 & !missing(spswk)
*replace spswk = rbinomial(1,0.5) if spswk==0.5
replace spswk = 1 if spswk > 0 & !missing(spswk)
replace spswk = . if spspres == 0

label var spspres "Spouse present"
label var spswk "Spouse working"

* filtering out the time fixed effect
foreach ivcol in 0 1 {
   preserve
      keep if calyr <. & col == `ivcol' & age>=22+`ivcol'*4 & age<=65
      xi: reg lnwage i.age i.calyr
      predict p
      keep if age == 30
      collapse (mean) p, by(calyr)
      sort calyr
      gen p10 = p[10]
      replace p = p - p10
      gen col = `ivcol'
      if (`ivcol'==1) {
         append using dta_sipp_lnw_p.dta
      }
      sort col calyr
      save dta_sipp_lnw_p.dta, replace
   restore
}

sort col calyr
merge m:1 col calyr using dta_sipp_lnw_p.dta, nogenerate
sort id age
gen lnw_exclyr = lnwage - p

compress

save dta_sipp_cleaned.dta, replace
* on average each id is observed 4 times, conditional on being observed 2+ times;

*tab spspres
*tab spswk
*pause

/*******/

*******************************************************
* generate moments on SS and wage changes on Unemployment spell

use dta_sipp_4ss_unemp.dta, clear
p_moments_ss
p_moments_ssi
p_moments_unemp
*******************************************************


********************************************************
* generate changes in marital status, 
* and whether wife works or not
********************************************************
* wife's income if working
use dta_sipp_cleaned.dta if spspres==1 & spswk==1 & spsinc>0 & !missing(spsinc), clear

* it is in 2009, deflate to 2004 using CPI;
* in lnw, the year fixed effect is included, which includes the CPI, so no need to deflate;
gen year = 1900 + calyr
sort year
merge m:1 year using "dta_cpi.dta", keepusing(cpi04)
drop if _merge==2
drop _merge
*pause
replace spsinc = spsinc * cpi04

* income tax for the spousal income
 * spsinc is hourly, translate it into annual yearly
gen spsinc_pretax = spsinc * 2000
gen spsinc_posttax = .
do "prog_usa_tax.do"
p_usa_ftax_2004 spsinc_pretax spsinc_posttax
gen spsinc_hrly_pretax = spsinc
replace spsinc = spsinc_posttax / 2000
drop spsinc_pretax spsinc_posttax


* spousal income remove outliers: 5% and 95%
preserve
   collapse (p5) p5inc=spsinc (p95) p95inc=spsinc, by(col age)
   sort col age
   save dta_temp.dta, replace
restore

sort col age
merge m:1 col age using dta_temp.dta, nogenerate
replace spsinc = . if spsinc<=p5inc | spsinc>=p95inc
erase dta_temp.dta

gen spsinc_ln = log(spsinc)

*pause

collapse (mean) spsinc spsinc_ln (sd) sd_spsinc=spsinc sd_spsinc_ln=spsinc_ln, by(col age)
sort col age

gen age2 = age^2/100
gen age3 = age2*age/100
gen age4 = age3*age/100

* smooth

foreach iv in spsinc spsinc_ln sd_spsinc sd_spsinc_ln {
   gen `iv'_bar = .
   foreach ivcol in 0 1 {
      reg `iv' age age2 age3 age4 if col==`ivcol'
      predict `iv'_bar_col`ivcol' if col==`ivcol'
      replace `iv'_bar = `iv'_bar_col`ivcol' if col==`ivcol'
      drop `iv'_bar_col`ivcol'
   }
}
sort col age

save dta_sipp_spinc.dta, replace 
*outsheet age spsinc spsinc_ln sd_spsinc sd_spsinc_ln using `lvrawdir'/raw_sipp_spinc.raw, nonames nolabel replace
outsheet col age spsinc_bar spsinc_ln_bar sd_spsinc_bar sd_spsinc_ln_bar using `lvrawdir'/raw_sipp_spinc.raw, nonames nolabel replace
if (${gvcopy2fdir}==1) {
   cp `lvrawdir'/raw_sipp_spinc.raw ${gvfdir}, replace
}

foreach ivcol in 0 1 {
   line spsinc spsinc_bar sd_spsinc sd_spsinc_bar age if col==`ivcol', lpattern("l" "_" "_." "-") xline(65, lcolor(gs15) lwidth(medthick)) title("Spouse inc") graphregion(color(white)) legend(row(1) order(1 "Mean" 2 "Mean, smooth" 3 "SD" 4 "SD, smooth"))
   graph export pics/col`ivcol'_spouse_inc.eps, replace
   
   line spsinc_ln spsinc_ln_bar sd_spsinc_ln sd_spsinc_ln_bar age if col==`ivcol', lpattern("l" "_" "_." "-") xline(65, lcolor(gs15) lwidth(medthick)) title("Spouse ln(inc)") graphregion(color(white)) legend(row(1) order(1 "Mean" 2 "Mean, smooth" 3 "SD" 4 "SD, smooth"))
   graph export pics/col`ivcol'_spouse_inc_ln.eps, replace
}

************************************************************
* transition of the marital status and spouse working status
************************************************************
foreach ivcol in 0 1 {
   use dta_sipp_cleaned.dta, clear
   keep if col == `ivcol'
   gen byte Mt = 0 if spspres==0
   replace Mt = 1 if spspres==1 & spswk == 0
   replace Mt = 2 if spspres==1 & spswk == 1
   sort id age
   by id: gen Mtn = Mt[_n+1] if age==age[_n+1]-1
   drop if missing(Mtn)
  
   tabulate Mtn, gen(Mtnext)
   ren Mtnext1 Mtnext0
   ren Mtnext2 Mtnext1
   ren Mtnext3 Mtnext2
   *pause
   sort age Mt
   gen age2 = age^2/100
   gen age3 = age2*age/100
   gen age4 = age3*age/100

   foreach iv in 0 1 2 {
      preserve
         keep if Mt==`iv'
         foreach jv in 0 1 2 {
            probit Mtnext`jv' age age2 age3 age4
            predict Mtnext`jv'_bar
         }
         collapse (mean) Mtnext0 Mtnext0_bar Mtnext1 Mtnext1_bar Mtnext2 Mtnext2_bar, by(age Mt)
         sort age Mt
         gen Msum = Mtnext0_bar + Mtnext1_bar + Mtnext2_bar
         foreach jv in 0 1 2 {
            replace Mtnext`jv'_bar = Mtnext`jv'_bar / Msum
         }
         drop Msum
         foreach jv in 0 1 2 {
            line Mtnext`jv' Mtnext`jv'_bar age, lpattern("_" "_." "_._") xline(65, lcolor(gs15) lwidth(medthick)) title("Family status") graphregion(color(white)) legend(row(1) order(1 "Data" 2 "Smoothed"))
            graph export pics/col`ivcol'_spouse_Mt`iv'to`jv'.eps, replace
         }
         gen col = `ivcol'
         if (`iv'!=0) {
            append using dta_sipp_Mt_col`ivcol'.dta
         }
         save dta_sipp_Mt_col`ivcol'.dta, replace
      restore
   }
}


use dta_sipp_Mt_col0.dta, clear
append using dta_sipp_Mt_col1.dta
sort col age Mt
save dta_sipp_Mt.dta, replace 

outsheet col age Mt Mtnext0_bar Mtnext1_bar Mtnext2_bar using `lvrawdir'/raw_sipp_Mt.raw, nonames nolabel replace
if (${gvcopy2fdir}==1) {
    cp `lvrawdir'/raw_sipp_Mt.raw ${gvfdir}, replace
}

* Mtnext0 Mtnext0_bar Mtnext1 Mtnext1_bar Mtnext2 Mtnext2_bar


********************************************************
* generate H space
********************************************************
use dta_sipp_cleaned.dta, clear
gen wage = exp(lnw_exclyr) if lnw_exclyr<.
collapse (p3) w3=wage (p97) w97=wage, by(col age)

sort col age
by col: replace age = age[_n-1] + 1 if age>=.

drop if age>100
gen age2 = age^2
gen age3 = age2 * age
gen age4 = age2 * age2

foreach ivcol in 0 1 {
   reg w3 age age2 age3 if col == `ivcol'
   predict w3hat_col`ivcol' if col == `ivcol'
   reg w97 age age2 age3 if col == `ivcol'
   predict w97hat_col`ivcol' if col == `ivcol'
   scatter w3 w3hat_col`ivcol' age if age<=80 & col==`ivcol'
   graph export pics/col`ivcol'_sipp_wage_p3.eps, replace
   scatter w97 w97hat_col`ivcol' age if age<=80 & col==`ivcol'
   graph export pics/col`ivcol'_sipp_wage_p97.eps, replace
}

gen w3hat = w3hat_col0 if col==0
replace w3hat = w3hat_col1 if col==1
gen w97hat = w97hat_col0 if col==0
replace w97hat = w97hat_col1 if col==1

sort col age
outsheet col age w3hat w97hat using `lvrawdir'/raw_sipp_wage_age.raw, nonames nolabel replace
if (${gvcopy2fdir}==1) {
    cp `lvrawdir'/raw_sipp_wage_age.raw ${gvfdir}, replace
}
*****************************************************************************


*******************************************************************
*   generate moments
***************************************************************************
use dta_sipp_cleaned.dta, clear

set seed 11223344
xtset id age
xtdes
global xtN = r(N)

p_moments

use dta_sipp_moments.dta, clear

*plot the moments
foreach ivcol in 0 1 {
   line lnw_mean lnw_exclyr_mean lnw_fe_mean lnw_exclyr_fe_mean lnw_exclyr_fd lnw_exclyr_ft_fd age if col==`ivcol' & age>=22 & age<=65, graphregion(color(white)) ///
         lpattern("l" "_" "_." "-" "-." "l")  xline(65, lcolor(gs15) lwidth(medthick)) title("SIPP moments")  ///
         legend(ring(0) pos(6) col(1) order(1 "lnw" 2 "lnw_exclyr" 3 "lnw_fe" 4 "lnw_exclyr_fe" 5 "lnw_exclyr_fd" 6 "lnw_exclyr_ft_fd")) 
   graph export pics/col`ivcol'_sipp_moments_lnw.eps, replace

   line lnw_sd lnw_sd_exclyr age if col==`ivcol' & age>=18 & age<=65, graphregion(color(white)) ///
         lpattern("l" "_" "_.")  xline(65, lcolor(gs15) lwidth(medthick)) title("SIPP moments")  ///
         legend(ring(0) pos(6) col(1) order(1 "sdlnw" 2 "sdlnw_exclyr")) 
   graph export pics/col`ivcol'_sipp_moments_sdlnw.eps, replace

   line work2 lfpr_mean age if col==`ivcol' & age>=11, graphregion(color(white)) ///
         lpattern("l" "_" "_.") xline(65, lcolor(gs15) lwidth(medthick)) title("SIPP moments")  ///
         legend(ring(0) pos(6) col(1) order(1 "work2" 2 "lfpr")) 
   graph export pics/col`ivcol'_sipp_moments_work2.eps, replace

   line lfpr_mean work_pt age if col==`ivcol' & age>=11, graphregion(color(white)) ///
         lpattern("l" "_" "_.") xline(65, lcolor(gs15) lwidth(medthick)) title("SIPP moments")  ///
         legend(ring(0) pos(2) col(1) order(1 "lfpr" 2 "lfpr_PT")) 
   graph export pics/col`ivcol'_sipp_moments_lfpr&pt.eps, replace

   foreach iv in lfpr_mean lfpr1to0 lfpr0to1 lnw_mean lnw_exclyr_mean lnw_exclyr_fe_mean lnw_exclyr_fd lnw_exclyr_ft_fd lnw_sd lnw_sd_exclyr lnw_corr work_pt {
      line `iv' age if col==`ivcol' & age>=11, xline(60, lcolor(gs15) lwidth(medthick)) title("SIPP moments")
      graph export pics/col`ivcol'_sipp_moments_`iv'.eps, replace
      replace `iv' = -999 if col==`ivcol' & `iv'>=.
   }
}

sort col age

* we use the lnw_exclyr, lnw_exclyr_ft_fd (lnw_exclyr_fe), both of which exclude year fixed effects;
* _ft_ stands for “Full Time only”;
*outsheet col age lfpr_mean lnw_exclyr_mean lnw_sd_exclyr lnw_exclyr_fd lfpr1to0 lfpr0to1 using `lvrawdir'/raw_sipp_moments_age_v2.raw, nonames nolabel replace
outsheet col age lfpr_mean lnw_exclyr_mean lnw_sd_exclyr lnw_exclyr_ft_fd lfpr1to0 lfpr0to1 using `lvrawdir'/raw_sipp_moments_age_v2.raw, nonames nolabel replace

if (${gvcopy2fdir}==1) {
    cp `lvrawdir'/raw_sipp_moments_age_v2.raw ${gvfdir}, replace
}

outsheet col age work_pt using `lvrawdir'/raw_sipp_moments_age_workpt.raw, nonames nolabel replace
if (${gvcopy2fdir}==1) {
    cp `lvrawdir'/raw_sipp_moments_age_workpt.raw ${gvfdir}, replace
}

/********/


***********************************************************************
*   draw the empirical distribution of observed period
***********************************************************************
use dta_sipp_cleaned.dta, clear

sort col id age
by col id: gen nn = [_N]
keep if nn>1

by col id: gen dage = age - age[_n-1]
by col id: egen dagemax = max(dage)
tab dage
tab dagemax
keep if dagemax==1
sort col id age
by col id: gen agef = age[1]
by col id: keep if [_n]==[_N]
keep if age<=65
drop if (agef<22 & col==0) | (agef<26 & col==1)
gen agespan = age - agef
sort col id
outsheet col id agef age using raws/raw_sipp_ageobs.raw, nonames nolabel replace
if (${gvcopy2fdir}==1) {
    cp raws/raw_sipp_ageobs.raw ${gvfdir}, replace
}
sort id
save dta_sipp_ageobs.dta, replace


*********************************************************************************
* calculate the variance-covariance of the data
*********************************************************************************

use dta_sipp_cleaned.dta, clear
foreach ivcol in 0 1 {
   preserve
      disp "col=`ivcol'"
      keep if col==`ivcol' & age>=(22+`ivcol'*4) & age<=65 & calyr <.
      desc, short
      sort id calyr age
      by id: gen t = [_n]
      xtset id t
      xtdes
      by id: keep if [_n] == [_N]
      tab t
   restore
}

**** calculate the var-cov matrix *******;
capture ssc install pwcov

p_varcov 0
p_varcov 1 

use dta_varcov_col0.dta, clear
append using dta_varcov_col1.dta
sort col pw_t pw_tk
save dta_varcov.dta, replace
replace pw_cov = -999 if pw_cov >= .
outsheet col pw_t pw_tk pw_cov using `lvrawdir'/raw_Cvarcov.raw, nonames replace
cp `lvrawdir'/raw_Cvarcov.raw ${gvfdir}, replace

cap log close

